%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_curve, auc, roc_auc_score
import re
from tqdm import tqdm
from scipy.sparse import csr_matrix
from scipy import sparse
import math
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import operator
from sklearn.preprocessing import normalize
import os
from gensim.models import Word2Vec
import pickle
from chart_studio import plotly
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
from collections import Counter
data = pd.read_csv('preprocessed_final.csv')
data.head(1)
| school_state | teacher_prefix | project_grade_category | teacher_number_of_previously_posted_projects | project_is_approved | clean_categories | clean_subcategories | essay | price | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ca | mrs | grades_3_5 | 0 | 1 | literacy_language math_science | literacy mathematics | sitting still overrated it makes sense opera m... | 1418.08 |
data.shape
(54574, 9)
y = data['project_is_approved'].values
X = data.drop(['project_is_approved'], axis=1)
X.head(1)
| school_state | teacher_prefix | project_grade_category | teacher_number_of_previously_posted_projects | clean_categories | clean_subcategories | essay | price | |
|---|---|---|---|---|---|---|---|---|
| 0 | ca | mrs | grades_3_5 | 0 | literacy_language math_science | literacy mathematics | sitting still overrated it makes sense opera m... | 1418.08 |
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y)
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.33, stratify=y_train)
print(X_train.shape, y_train.shape)
print(X_cv.shape, y_cv.shape)
print(X_test.shape, y_test.shape)
(24497, 8) (24497,) (12067, 8) (12067,) (18010, 8) (18010,)
#PERFORMING BOW ON ESSAY ATTRIBUTE
vectorizer = CountVectorizer(min_df=10,ngram_range=(1,4), max_features=5000)
vectorizer.fit(X_train['essay'].values)
X_train_essay_bow = vectorizer.transform(X_train['essay'].values)
X_cv_essay_bow = vectorizer.transform(X_cv['essay'].values)
X_test_essay_bow = vectorizer.transform(X_test['essay'].values)
print("After vectorizations")
print(X_train_essay_bow.shape, y_train.shape)
print(X_cv_essay_bow.shape, y_cv.shape)
print(X_test_essay_bow.shape, y_test.shape)
print("--"*50)
(24497, 8) (24497,) (12067, 8) (12067,) (18010, 8) (18010,) ==================================================================================================== After vectorizations (24497, 5000) (24497,) (12067, 5000) (12067,) (18010, 5000) (18010,) ====================================================================================================
Feature_arr = list()
Feature_arr[:5]
[]
#PERFORMING TFIDF ON ESSAY ATTRIBUTE
vectorizer = TfidfVectorizer(min_df=10,ngram_range=(1,4), max_features=5000)
vectorizer.fit(X_train['essay'].values)
X_train_essay_tfidf = vectorizer.transform(X_train['essay'].values)
X_cv_essay_tfidf = vectorizer.transform(X_cv['essay'].values)
X_test_essay_tfidf = vectorizer.transform(X_test['essay'].values)
print("After vectorizations")
print(X_train_essay_tfidf.shape, y_train.shape)
print(X_cv_essay_tfidf.shape, y_cv.shape)
print(X_test_essay_tfidf.shape, y_test.shape)
print("="*100)
After vectorizations (24497, 5000) (24497,) (12067, 5000) (12067,) (18010, 5000) (18010,) ====================================================================================================
def tfidf_w2v(w2v_corpus):
'''This function computes tfidf_w2v and returns vector'''
train_index = list(w2v_corpus.index)
final_w2v_corpus = [x.split(' ') for x in w2v_corpus[train_index[:]]]
final_tfidf_corpus = [x for x in w2v_corpus[train_index[:]]]
Idf_values = open("data.pkl","rb")
Idf_dict = pickle.load(Idf_values)
model = Word2Vec(final_w2v_corpus,min_count = 1)
if isinstance(final_tfidf_corpus,(list,)):
tfidf_w2v = list()
for idx,row in enumerate(tqdm(final_tfidf_corpus)):
sum1 = model.wv['this']
sum2 = sum1*0
word_freq = dict(Counter(row.split(' ')))
for word,freq in word_freq.items():
try:
sum2 += freq*Idf_dict[word]*model[word]
except:
continue
tfidf_w2v.append(sum2)
tfidf_w2v_array = np.array(tfidf_w2v)
tfidf_w2v_sparse = sparse.csr_matrix(tfidf_w2v_array)
output = normalize(tfidf_w2v_sparse, norm='l2', axis=1, copy=False, return_norm=False)
Idf_values.close()
return output
#PERFORMING TFIDFW2V ON ESSAY ATTRIBUTE
X_train_essay_tfidfw2v = tfidf_w2v(X_train['essay'])
X_cv_essay_tfidfw2v = tfidf_w2v(X_cv['essay'])
X_test_essay_tfidfw2v = tfidf_w2v(X_test['essay'])
100%|█████████████████████████████████████████████████████████████████████████| 24497/24497 [00:02<00:00, 10093.74it/s] 100%|█████████████████████████████████████████████████████████████████████████| 12067/12067 [00:01<00:00, 11381.78it/s] 100%|█████████████████████████████████████████████████████████████████████████| 18010/18010 [00:01<00:00, 11205.43it/s]
#ENCODING NUMERICAL AND CATEGORICAL DATA
vectorizer = CountVectorizer()
vectorizer.fit(X_train['school_state'].values)
X_train_state_ohe = vectorizer.transform(X_train['school_state'].values)
X_cv_state_ohe = vectorizer.transform(X_cv['school_state'].values)
X_test_state_ohe = vectorizer.transform(X_test['school_state'].values)
print("After vectorizations")
print(X_train_state_ohe.shape, y_train.shape)
print(X_cv_state_ohe.shape, y_cv.shape)
print(X_test_state_ohe.shape, y_test.shape)
print(vectorizer.get_feature_names())
print("="*100)
After vectorizations (24497, 51) (24497,) (12067, 51) (12067,) (18010, 51) (18010,) ['ak', 'al', 'ar', 'az', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 'ga', 'hi', 'ia', 'id', 'il', 'in', 'ks', 'ky', 'la', 'ma', 'md', 'me', 'mi', 'mn', 'mo', 'ms', 'mt', 'nc', 'nd', 'ne', 'nh', 'nj', 'nm', 'nv', 'ny', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx', 'ut', 'va', 'vt', 'wa', 'wi', 'wv', 'wy'] ====================================================================================================
vectorizer = CountVectorizer()
vectorizer.fit(X_train['teacher_prefix'].values) # fit has to happen only on train data
# we use the fitted CountVectorizer to convert the text to vector
X_train_teacher_ohe = vectorizer.transform(X_train['teacher_prefix'].values)
X_cv_teacher_ohe = vectorizer.transform(X_cv['teacher_prefix'].values)
X_test_teacher_ohe = vectorizer.transform(X_test['teacher_prefix'].values)
Feature_arr.extend(vectorizer.get_feature_names())
print("After vectorizations")
print(X_train_teacher_ohe.shape, y_train.shape)
print(X_cv_teacher_ohe.shape, y_cv.shape)
print(X_test_teacher_ohe.shape, y_test.shape)
print(vectorizer.get_feature_names())
print("="*100)
After vectorizations (24497, 5) (24497,) (12067, 5) (12067,) (18010, 5) (18010,) ['dr', 'mr', 'mrs', 'ms', 'teacher'] ====================================================================================================
vectorizer = CountVectorizer()
vectorizer.fit(X_train['project_grade_category'].values) # fit has to happen only on train data
# we use the fitted CountVectorizer to convert the text to vector
X_train_grade_ohe = vectorizer.transform(X_train['project_grade_category'].values)
X_cv_grade_ohe = vectorizer.transform(X_cv['project_grade_category'].values)
X_test_grade_ohe = vectorizer.transform(X_test['project_grade_category'].values)
print("After vectorizations")
print(X_train_grade_ohe.shape, y_train.shape)
print(X_cv_grade_ohe.shape, y_cv.shape)
print(X_test_grade_ohe.shape, y_test.shape)
print("="*100)
After vectorizations (24497, 4) (24497,) (12067, 4) (12067,) (18010, 4) (18010,) ['grades_3_5', 'grades_6_8', 'grades_9_12', 'grades_prek_2'] ====================================================================================================
vectorizer = CountVectorizer()
vectorizer.fit(X_train['clean_categories'].values) # fit has to happen only on train data
# we use the fitted CountVectorizer to convert the text to vector
X_train_clean_cat_ohe = vectorizer.transform(X_train['clean_categories'].values)
X_cv_clean_cat_ohe = vectorizer.transform(X_cv['clean_categories'].values)
X_test_clean_cat_ohe = vectorizer.transform(X_test['clean_categories'].values)
Feature_arr.extend(vectorizer.get_feature_names())
print("After vectorizations")
print(X_train_clean_cat_ohe.shape, y_train.shape)
print(X_cv_clean_cat_ohe.shape, y_cv.shape)
print(X_test_clean_cat_ohe.shape, y_test.shape)
print(vectorizer.get_feature_names())
print("="*100)
After vectorizations (24497, 9) (24497,) (12067, 9) (12067,) (18010, 9) (18010,) ['appliedlearning', 'care_hunger', 'health_sports', 'history_civics', 'literacy_language', 'math_science', 'music_arts', 'specialneeds', 'warmth'] ====================================================================================================
vectorizer = CountVectorizer()
vectorizer.fit(X_train['clean_subcategories'].values) # fit has to happen only on train data
# we use the fitted CountVectorizer to convert the text to vector
X_train_clean_sub_cat_ohe = vectorizer.transform(X_train['clean_subcategories'].values)
X_cv_clean_sub_cat_ohe = vectorizer.transform(X_cv['clean_subcategories'].values)
X_test_clean_sub_cat_ohe = vectorizer.transform(X_test['clean_subcategories'].values)
Feature_arr.extend(vectorizer.get_feature_names())
print("After vectorizations")
print(X_train_clean_sub_cat_ohe.shape, y_train.shape)
print(X_cv_clean_sub_cat_ohe.shape, y_cv.shape)
print(X_test_clean_sub_cat_ohe.shape, y_test.shape)
print(vectorizer.get_feature_names())
print("="*100)
After vectorizations (24497, 30) (24497,) (12067, 30) (12067,) (18010, 30) (18010,) ['appliedsciences', 'care_hunger', 'charactereducation', 'civics_government', 'college_careerprep', 'communityservice', 'earlydevelopment', 'economics', 'environmentalscience', 'esl', 'extracurricular', 'financialliteracy', 'foreignlanguages', 'gym_fitness', 'health_lifescience', 'health_wellness', 'history_geography', 'literacy', 'literature_writing', 'mathematics', 'music', 'nutritioneducation', 'other', 'parentinvolvement', 'performingarts', 'socialsciences', 'specialneeds', 'teamsports', 'visualarts', 'warmth'] ====================================================================================================
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
normalizer.fit(X_train['price'].values.reshape(-1,1))
X_train_price_norm = normalizer.transform(X_train['price'].values.reshape(-1,1))
X_cv_price_norm = normalizer.transform(X_cv['price'].values.reshape(-1,1))
X_test_price_norm = normalizer.transform(X_test['price'].values.reshape(-1,1))
Feature_arr.append('price')
print("After vectorizations")
print(X_train_price_norm.shape, y_train.shape)
print(X_cv_price_norm.shape, y_cv.shape)
print(X_test_price_norm.shape, y_test.shape)
print("="*100)
After vectorizations (24497, 1) (24497,) (12067, 1) (12067,) (18010, 1) (18010,) ====================================================================================================
normalizer = Normalizer()
normalizer.fit(X_train['teacher_number_of_previously_posted_projects'].values.reshape(-1,1))
X_train_ppp_norm = normalizer.transform(X_train['teacher_number_of_previously_posted_projects'].values.reshape(-1,1))
X_cv_ppp_norm = normalizer.transform(X_cv['teacher_number_of_previously_posted_projects'].values.reshape(-1,1))
X_test_ppp_norm = normalizer.transform(X_test['teacher_number_of_previously_posted_projects'].values.reshape(-1,1))
Feature_arr.append('teacher_number_of_previously_posted_projects')
print("After vectorizations")
print(X_train_ppp_norm.shape, y_train.shape)
print(X_cv_ppp_norm.shape, y_cv.shape)
print(X_test_ppp_norm.shape, y_test.shape)
print("="*100)
After vectorizations (24497, 1) (24497,) (12067, 1) (12067,) (18010, 1) (18010,) ====================================================================================================
#CREATING DATASET WITH BOW VECTORIZATION
from scipy.sparse import hstack
X_tr = hstack((X_train_essay_bow, X_train_state_ohe, X_train_teacher_ohe, X_train_grade_ohe,X_train_clean_cat_ohe,X_train_clean_sub_cat_ohe,X_train_ppp_norm,X_train_price_norm)).tocsr()
X_cr = hstack((X_cv_essay_bow, X_cv_state_ohe, X_cv_teacher_ohe, X_cv_grade_ohe, X_cv_clean_cat_ohe, X_cv_clean_sub_cat_ohe, X_cv_ppp_norm, X_cv_price_norm)).tocsr()
X_te = hstack((X_test_essay_bow, X_test_state_ohe, X_test_teacher_ohe, X_test_grade_ohe,X_test_clean_cat_ohe,X_test_clean_sub_cat_ohe,X_test_ppp_norm,X_test_price_norm)).tocsr()
print("Final Data matrix")
print(X_tr.shape, y_train.shape)
print(X_cr.shape, y_cv.shape)
print(X_te.shape, y_test.shape)
print("="*100)
Final Data matrix (24497, 5101) (24497,) (12067, 5101) (12067,) (18010, 5101) (18010,) ====================================================================================================
#CREATING DATASET WITH TFIDF VECTORIZATION
from scipy.sparse import hstack
X_tr_tfidf = hstack((X_train_essay_tfidf,X_train_state_ohe, X_train_teacher_ohe, X_train_grade_ohe,X_train_clean_cat_ohe,X_train_clean_sub_cat_ohe,X_train_ppp_norm,X_train_price_norm)).tocsr()
X_cr_tfidf = hstack((X_cv_essay_tfidf, X_cv_state_ohe, X_cv_teacher_ohe, X_cv_grade_ohe, X_cv_clean_cat_ohe, X_cv_clean_sub_cat_ohe, X_cv_ppp_norm, X_cv_price_norm)).tocsr()
X_te_tfidf = hstack((X_test_essay_tfidf,X_test_state_ohe, X_test_teacher_ohe, X_test_grade_ohe,X_test_clean_cat_ohe,X_test_clean_sub_cat_ohe,X_test_ppp_norm,X_test_price_norm)).tocsr()
print("Final Data matrix")
print(X_tr_tfidf.shape, y_train.shape)
print(X_cr_tfidf.shape, y_cv.shape)
print(X_te_tfidf.shape, y_test.shape)
print("="*100)
Final Data matrix (24497, 5101) (24497,) (12067, 5101) (12067,) (18010, 5101) (18010,) ====================================================================================================
#CREATING DATASET WITH TFIDF-W2V VECTORIZATION
X_tr_tfidfw2v = hstack((X_train_essay_tfidfw2v,X_train_state_ohe, X_train_teacher_ohe, X_train_grade_ohe,X_train_clean_cat_ohe,X_train_clean_sub_cat_ohe,X_train_ppp_norm,X_train_price_norm)).tocsr()
X_cr_tfidfw2v = hstack((X_cv_essay_tfidfw2v, X_cv_state_ohe, X_cv_teacher_ohe, X_cv_grade_ohe, X_cv_clean_cat_ohe, X_cv_clean_sub_cat_ohe, X_cv_ppp_norm, X_cv_price_norm)).tocsr()
X_te_tfidfw2v = hstack((X_test_essay_tfidfw2v,X_test_state_ohe, X_test_teacher_ohe, X_test_grade_ohe,X_test_clean_cat_ohe,X_test_clean_sub_cat_ohe,X_test_ppp_norm,X_test_price_norm)).tocsr()
print("Final Data matrix")
print(X_tr_tfidfw2v.shape, y_train.shape)
print(X_cr_tfidfw2v.shape, y_cv.shape)
print(X_te_tfidfw2v.shape, y_test.shape)
print("="*100)
Final Data matrix (24497, 201) (24497,) (12067, 201) (12067,) (18010, 201) (18010,) ====================================================================================================
def batch_predict(clf, data):
y_pred_vals = []
temp = data.shape[0]%1000
tr_loop = data.shape[0] - temp
for i in range(0, tr_loop, 1000):
y_pred_vals.extend(clf.predict_proba(data[i:i+1000])[:,1])
if data.shape[0]%1000 !=0:
y_pred_vals.extend(clf.predict_proba(data[tr_loop:])[:,1])
return y_pred_vals
#PERFORMING GRID SEARCH ON THE HYPERPARAMETERS
from sklearn.naive_bayes import MultinomialNB
train_auc = []
cv_auc = []
alpha = [0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000,10000]
for i in tqdm(alpha):
clf = MultinomialNB(alpha=i)
clf.fit(X_tr, y_train)
y_train_pred = batch_predict(clf, X_tr) #here clearly multinomialNB is giving the probality scores.
y_cv_pred = batch_predict(clf, X_cr)
train_auc.append(roc_auc_score(y_train,y_train_pred))
cv_auc.append(roc_auc_score(y_cv, y_cv_pred))
plt.plot(alpha, train_auc, label='Train AUC')
plt.plot(alpha, cv_auc, label='CV AUC')
plt.scatter(alpha, train_auc, label='Train AUC points')
plt.scatter(alpha, cv_auc, label='CV AUC points')
plt.legend()
plt.xlabel("alpha: hyperparameter")
plt.ylabel("AUC")
plt.title("ERROR PLOTS")
plt.grid()
plt.show()
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00, 8.84it/s]
BEST HYPERPARAMETER IS CONCLUDED FOR ALPHA AS "1" SINCE THE TRAINING AUC AND CV AUC ARE CLOSE TO EACH OTHER WITH HIGH ACCURACY
best_alpha = 1
from sklearn.metrics import roc_curve, auc
clf = MultinomialNB(alpha = best_alpha)
clf.fit(X_tr, y_train)
# roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
# not the predicted outputs
y_train_pred = batch_predict(clf, X_tr)
y_test_pred = batch_predict(clf, X_te)
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred)
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
plt.xlabel("alpha: hyperparameter")
plt.ylabel("AUC")
plt.title("ERROR PLOTS")
plt.grid()
plt.show()
TEST AUC IS 70%
MODEL DID GOOD FOR "BAG OF WORDS"
#PERFORMING GRID SEARCH ON THE HYPERPARAMETERS
from sklearn.naive_bayes import MultinomialNB
train_auc = []
cv_auc = []
alpha = [0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000,10000]
for i in tqdm(alpha):
clf = MultinomialNB(alpha=i)
clf.fit(X_tr_tfidf, y_train)
y_train_pred = batch_predict(clf, X_tr_tfidf) #here clearly multinomialNB is giving the probality scores.
y_cv_pred = batch_predict(clf, X_cr_tfidf)
train_auc.append(roc_auc_score(y_train,y_train_pred))
cv_auc.append(roc_auc_score(y_cv, y_cv_pred))
plt.plot(alpha, train_auc, label='Train AUC')
plt.plot(alpha, cv_auc, label='CV AUC')
plt.scatter(alpha, train_auc, label='Train AUC points')
plt.scatter(alpha, cv_auc, label='CV AUC points')
plt.legend()
plt.xlabel("alpha: hyperparameter")
plt.ylabel("AUC")
plt.title("ERROR PLOTS")
plt.grid()
plt.show()
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00, 7.55it/s]
BEST PARAMETER FOR ALPHA IS PREDICTED TO BE "1"
best_alpha = 1
from sklearn.metrics import roc_curve, auc
clf = MultinomialNB(alpha = best_alpha)
clf.fit(X_tr_tfidf, y_train)
# roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
# not the predicted outputs
y_train_pred = batch_predict(clf, X_tr_tfidf)
y_test_pred = batch_predict(clf, X_te_tfidf)
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred)
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
plt.xlabel("alpha: hyperparameter")
plt.ylabel("AUC")
plt.title("ERROR PLOTS")
plt.grid()
plt.show()
TEST ACCURACY FOR TESTING DATA FOR TF-IDF VECTORIZED DATA IS 67%
from sklearn.naive_bayes import MultinomialNB
train_auc = []
cv_auc = []
alpha = [0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000,10000]
for i in tqdm(alpha):
clf = MultinomialNB(alpha=i)
clf.fit(X_tr_tfidfw2v, y_train)
y_train_pred = batch_predict(clf, X_tr_tfidfw2v) #here clearly multinomialNB is giving the probality scores.
y_cv_pred = batch_predict(clf, X_cr_tfidfw2v)
train_auc.append(roc_auc_score(y_train,y_train_pred))
cv_auc.append(roc_auc_score(y_cv, y_cv_pred))
plt.plot(alpha, train_auc, label='Train AUC')
plt.plot(alpha, cv_auc, label='CV AUC')
plt.scatter(alpha, train_auc, label='Train AUC points')
plt.scatter(alpha, cv_auc, label='CV AUC points')
plt.legend()
plt.xlabel("alpha: hyperparameter")
plt.ylabel("AUC")
plt.title("ERROR PLOTS")
plt.grid()
plt.show()
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 30.68it/s]
best_alpha = 1
from sklearn.metrics import roc_curve, auc
clf = MultinomialNB(alpha = best_alpha)
clf.fit(X_tr_tfidfw2v, y_train)
# roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
# not the predicted outputs
y_train_pred = batch_predict(clf, X_tr_tfidfw2v)
y_test_pred = batch_predict(clf, X_te_tfidfw2v)
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred)
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
plt.xlabel("alpha: hyperparameter")
plt.ylabel("AUC")
plt.title("ERROR PLOTS")
plt.grid()
plt.show()
TEST ACCURACY IS 58%
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Vectorizer","Model","Hyperparameter","AUC"]
x.add_row(["BOW","Naive Bayes","alpha","0.70"])
x.add_row(["TFIDF","Naive Bayes","alpha","0.67"])
x.add_row(["TFIDFW2V","Naive Bayes","alpha","0.58"])
print(x)
+------------+-------------+----------------+------+ | Vectorizer | Model | Hyperparameter | AUC | +------------+-------------+----------------+------+ | BOW | Naive Bayes | alpha | 0.70 | | TFIDF | Naive Bayes | alpha | 0.67 | | TFIDFW2V | Naive Bayes | alpha | 0.58 | +------------+-------------+----------------+------+
BEST ACCURACY IS OBTAINED FROM BOW FOR NAIVE BAYES WITH 68% ACCURACY
PLOTTING A CONFUSION PLOT
def find_best_threshold(threshould, fpr, tpr):
t = threshould[np.argmax(tpr*(1-fpr))]
print("the maximum value of tpr*(1-fpr)", max(tpr*(1-fpr)), "for threshold", np.round(t,3))
return t
def predict_with_best_t(proba, threshould):
predictions = []
for i in proba:
if i>=threshould:
predictions.append(1)
else:
predictions.append(0)
return predictions
CONFUSION MATRIX FOR THE BEST ACCURACY OF NAIVE BAYES
print("="*100)
from sklearn.metrics import confusion_matrix
best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
print("Test confusion matrix")
confu_mat = confusion_matrix(y_test, predict_with_best_t(y_test_pred, best_t))
sns.heatmap(confu_mat,annot = True)
==================================================================================================== the maximum value of tpr*(1-fpr) 0.4593571926702198 for threshold 0.569 Test confusion matrix
<AxesSubplot:>
from xgboost import XGBClassifier
from tqdm import tqdm
estimators = [5,10,50, 75, 100]
l_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
train_auc = []
cv_auc = []
for i in tqdm(estimators):
for j in tqdm(l_rate):
pred_list = []
pred_list_cv = []
clf = XGBClassifier(n_estimators = i,learning_rate = j,n_jobs = -1)
clf.fit(X_tr,y_train)
train_pred = clf.predict_proba(X_tr)
cv_pred = clf.predict_proba(X_cr)
train_auc.append(roc_auc_score(y_train,np.transpose(train_pred)[1]))
cv_auc.append(roc_auc_score(y_cv,np.transpose(cv_pred)[1]))
print("train_auc = ",train_auc,"\ncv_auc = ",cv_auc)
print("roc_auc_score train: ",roc_auc_score(y_train,np.transpose(train_pred)[1]))
print("roc_auc_score cv: ",roc_auc_score(y_cv,np.transpose(cv_pred)[1]))
0%| | 0/5 [00:00<?, ?it/s] 0%| | 0/6 [00:00<?, ?it/s] 17%|██████████████ | 1/6 [00:01<00:07, 1.58s/it]
train_auc = [0.6305308552567224] cv_auc = [0.6246045610612336] roc_auc_score train: 0.6305308552567224 roc_auc_score cv: 0.6246045610612336
33%|████████████████████████████ | 2/6 [00:01<00:03, 1.13it/s]
train_auc = [0.6305308552567224, 0.6329945187583427] cv_auc = [0.6246045610612336, 0.6241497514216621] roc_auc_score train: 0.6329945187583427 roc_auc_score cv: 0.6241497514216621
50%|██████████████████████████████████████████ | 3/6 [00:02<00:02, 1.49it/s]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422] roc_auc_score train: 0.6492002065326807 roc_auc_score cv: 0.6374883201048422
67%|████████████████████████████████████████████████████████ | 4/6 [00:02<00:01, 1.76it/s]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723] roc_auc_score train: 0.6892792932448508 roc_auc_score cv: 0.6618528203496723
83%|██████████████████████████████████████████████████████████████████████ | 5/6 [00:03<00:00, 1.90it/s]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195] roc_auc_score train: 0.7043416672188931 roc_auc_score cv: 0.6668381874428195
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00, 1.61it/s] 20%|████████████████▊ | 1/5 [00:03<00:14, 3.73s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687] roc_auc_score train: 0.7121949304036933 roc_auc_score cv: 0.6670094318871687
0%| | 0/6 [00:00<?, ?it/s] 17%|██████████████ | 1/6 [00:00<00:03, 1.38it/s]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845] roc_auc_score train: 0.6303589629503407 roc_auc_score cv: 0.6246269927017845
33%|████████████████████████████ | 2/6 [00:01<00:03, 1.10it/s]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989] roc_auc_score train: 0.6410713508089328 roc_auc_score cv: 0.6338548982347989
50%|██████████████████████████████████████████ | 3/6 [00:02<00:02, 1.22it/s]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413] roc_auc_score train: 0.6672691378010028 roc_auc_score cv: 0.6502885781792413
67%|████████████████████████████████████████████████████████ | 4/6 [00:03<00:01, 1.22it/s]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457] roc_auc_score train: 0.7084746362405687 roc_auc_score cv: 0.670089669995457
83%|██████████████████████████████████████████████████████████████████████ | 5/6 [00:04<00:00, 1.28it/s]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433] roc_auc_score train: 0.7355778181691959 roc_auc_score cv: 0.6823510322878433
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.29it/s] 40%|█████████████████████████████████▌ | 2/5 [00:08<00:12, 4.28s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831] roc_auc_score train: 0.7584528469638592 roc_auc_score cv: 0.6856068548752831
0%| | 0/6 [00:00<?, ?it/s] 17%|██████████████ | 1/6 [00:02<00:13, 2.74s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167] roc_auc_score train: 0.6327612622633709 roc_auc_score cv: 0.623983188363167
33%|████████████████████████████ | 2/6 [00:05<00:11, 2.84s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709, 0.6480588334138007] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167, 0.638927846254966] roc_auc_score train: 0.6480588334138007 roc_auc_score cv: 0.638927846254966
50%|██████████████████████████████████████████ | 3/6 [00:08<00:08, 2.90s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709, 0.6480588334138007, 0.6917145368082576] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167, 0.638927846254966, 0.6636920035622746] roc_auc_score train: 0.6917145368082576 roc_auc_score cv: 0.6636920035622746
67%|████████████████████████████████████████████████████████ | 4/6 [00:11<00:05, 2.89s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709, 0.6480588334138007, 0.6917145368082576, 0.8164735175876225] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167, 0.638927846254966, 0.6636920035622746, 0.7098033291285386] roc_auc_score train: 0.8164735175876225 roc_auc_score cv: 0.7098033291285386
83%|██████████████████████████████████████████████████████████████████████ | 5/6 [00:14<00:02, 2.86s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709, 0.6480588334138007, 0.6917145368082576, 0.8164735175876225, 0.8627140074597901] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167, 0.638927846254966, 0.6636920035622746, 0.7098033291285386, 0.7159148458513916] roc_auc_score train: 0.8627140074597901 roc_auc_score cv: 0.7159148458513916
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:17<00:00, 2.87s/it] 60%|██████████████████████████████████████████████████▍ | 3/5 [00:25<00:20, 10.18s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709, 0.6480588334138007, 0.6917145368082576, 0.8164735175876225, 0.8627140074597901, 0.8965112267631012] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167, 0.638927846254966, 0.6636920035622746, 0.7098033291285386, 0.7159148458513916, 0.7105087717142344] roc_auc_score train: 0.8965112267631012 roc_auc_score cv: 0.7105087717142344
0%| | 0/6 [00:00<?, ?it/s] 17%|██████████████ | 1/6 [00:04<00:21, 4.25s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709, 0.6480588334138007, 0.6917145368082576, 0.8164735175876225, 0.8627140074597901, 0.8965112267631012, 0.6409815706266035] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167, 0.638927846254966, 0.6636920035622746, 0.7098033291285386, 0.7159148458513916, 0.7105087717142344, 0.6330548851529262] roc_auc_score train: 0.6409815706266035 roc_auc_score cv: 0.6330548851529262
33%|████████████████████████████ | 2/6 [00:08<00:17, 4.27s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709, 0.6480588334138007, 0.6917145368082576, 0.8164735175876225, 0.8627140074597901, 0.8965112267631012, 0.6409815706266035, 0.6650909201354375] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167, 0.638927846254966, 0.6636920035622746, 0.7098033291285386, 0.7159148458513916, 0.7105087717142344, 0.6330548851529262, 0.6505155441191033] roc_auc_score train: 0.6650909201354375 roc_auc_score cv: 0.6505155441191033
50%|██████████████████████████████████████████ | 3/6 [00:13<00:13, 4.57s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709, 0.6480588334138007, 0.6917145368082576, 0.8164735175876225, 0.8627140074597901, 0.8965112267631012, 0.6409815706266035, 0.6650909201354375, 0.7028190290912878] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167, 0.638927846254966, 0.6636920035622746, 0.7098033291285386, 0.7159148458513916, 0.7105087717142344, 0.6330548851529262, 0.6505155441191033, 0.6682110038445231] roc_auc_score train: 0.7028190290912878 roc_auc_score cv: 0.6682110038445231
67%|████████████████████████████████████████████████████████ | 4/6 [00:17<00:08, 4.49s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709, 0.6480588334138007, 0.6917145368082576, 0.8164735175876225, 0.8627140074597901, 0.8965112267631012, 0.6409815706266035, 0.6650909201354375, 0.7028190290912878, 0.8469069522150591] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167, 0.638927846254966, 0.6636920035622746, 0.7098033291285386, 0.7159148458513916, 0.7105087717142344, 0.6330548851529262, 0.6505155441191033, 0.6682110038445231, 0.7171994473884076] roc_auc_score train: 0.8469069522150591 roc_auc_score cv: 0.7171994473884076
83%|██████████████████████████████████████████████████████████████████████ | 5/6 [00:22<00:04, 4.41s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709, 0.6480588334138007, 0.6917145368082576, 0.8164735175876225, 0.8627140074597901, 0.8965112267631012, 0.6409815706266035, 0.6650909201354375, 0.7028190290912878, 0.8469069522150591, 0.9034203129388229] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167, 0.638927846254966, 0.6636920035622746, 0.7098033291285386, 0.7159148458513916, 0.7105087717142344, 0.6330548851529262, 0.6505155441191033, 0.6682110038445231, 0.7171994473884076, 0.7196363362928144] roc_auc_score train: 0.9034203129388229 roc_auc_score cv: 0.7196363362928144
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:26<00:00, 4.40s/it] 80%|███████████████████████████████████████████████████████████████████▏ | 4/5 [00:52<00:16, 16.60s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709, 0.6480588334138007, 0.6917145368082576, 0.8164735175876225, 0.8627140074597901, 0.8965112267631012, 0.6409815706266035, 0.6650909201354375, 0.7028190290912878, 0.8469069522150591, 0.9034203129388229, 0.9289626111158444] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167, 0.638927846254966, 0.6636920035622746, 0.7098033291285386, 0.7159148458513916, 0.7105087717142344, 0.6330548851529262, 0.6505155441191033, 0.6682110038445231, 0.7171994473884076, 0.7196363362928144, 0.7156785659042575] roc_auc_score train: 0.9289626111158444 roc_auc_score cv: 0.7156785659042575
0%| | 0/6 [00:00<?, ?it/s] 17%|██████████████ | 1/6 [00:05<00:29, 5.82s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709, 0.6480588334138007, 0.6917145368082576, 0.8164735175876225, 0.8627140074597901, 0.8965112267631012, 0.6409815706266035, 0.6650909201354375, 0.7028190290912878, 0.8469069522150591, 0.9034203129388229, 0.9289626111158444, 0.640986627442813] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167, 0.638927846254966, 0.6636920035622746, 0.7098033291285386, 0.7159148458513916, 0.7105087717142344, 0.6330548851529262, 0.6505155441191033, 0.6682110038445231, 0.7171994473884076, 0.7196363362928144, 0.7156785659042575, 0.6331739679055889] roc_auc_score train: 0.640986627442813 roc_auc_score cv: 0.6331739679055889
33%|████████████████████████████ | 2/6 [00:11<00:23, 5.92s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709, 0.6480588334138007, 0.6917145368082576, 0.8164735175876225, 0.8627140074597901, 0.8965112267631012, 0.6409815706266035, 0.6650909201354375, 0.7028190290912878, 0.8469069522150591, 0.9034203129388229, 0.9289626111158444, 0.640986627442813, 0.667643799759545] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167, 0.638927846254966, 0.6636920035622746, 0.7098033291285386, 0.7159148458513916, 0.7105087717142344, 0.6330548851529262, 0.6505155441191033, 0.6682110038445231, 0.7171994473884076, 0.7196363362928144, 0.7156785659042575, 0.6331739679055889, 0.6505545394130752] roc_auc_score train: 0.667643799759545 roc_auc_score cv: 0.6505545394130752
50%|██████████████████████████████████████████ | 3/6 [00:18<00:18, 6.16s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709, 0.6480588334138007, 0.6917145368082576, 0.8164735175876225, 0.8627140074597901, 0.8965112267631012, 0.6409815706266035, 0.6650909201354375, 0.7028190290912878, 0.8469069522150591, 0.9034203129388229, 0.9289626111158444, 0.640986627442813, 0.667643799759545, 0.7120918415646625] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167, 0.638927846254966, 0.6636920035622746, 0.7098033291285386, 0.7159148458513916, 0.7105087717142344, 0.6330548851529262, 0.6505155441191033, 0.6682110038445231, 0.7171994473884076, 0.7196363362928144, 0.7156785659042575, 0.6331739679055889, 0.6505545394130752, 0.6724380514485606] roc_auc_score train: 0.7120918415646625 roc_auc_score cv: 0.6724380514485606
67%|████████████████████████████████████████████████████████ | 4/6 [00:24<00:12, 6.04s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709, 0.6480588334138007, 0.6917145368082576, 0.8164735175876225, 0.8627140074597901, 0.8965112267631012, 0.6409815706266035, 0.6650909201354375, 0.7028190290912878, 0.8469069522150591, 0.9034203129388229, 0.9289626111158444, 0.640986627442813, 0.667643799759545, 0.7120918415646625, 0.8730423494551892] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167, 0.638927846254966, 0.6636920035622746, 0.7098033291285386, 0.7159148458513916, 0.7105087717142344, 0.6330548851529262, 0.6505155441191033, 0.6682110038445231, 0.7171994473884076, 0.7196363362928144, 0.7156785659042575, 0.6331739679055889, 0.6505545394130752, 0.6724380514485606, 0.7206987345498825] roc_auc_score train: 0.8730423494551892 roc_auc_score cv: 0.7206987345498825
83%|██████████████████████████████████████████████████████████████████████ | 5/6 [00:29<00:05, 5.95s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709, 0.6480588334138007, 0.6917145368082576, 0.8164735175876225, 0.8627140074597901, 0.8965112267631012, 0.6409815706266035, 0.6650909201354375, 0.7028190290912878, 0.8469069522150591, 0.9034203129388229, 0.9289626111158444, 0.640986627442813, 0.667643799759545, 0.7120918415646625, 0.8730423494551892, 0.9250976217974811] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167, 0.638927846254966, 0.6636920035622746, 0.7098033291285386, 0.7159148458513916, 0.7105087717142344, 0.6330548851529262, 0.6505155441191033, 0.6682110038445231, 0.7171994473884076, 0.7196363362928144, 0.7156785659042575, 0.6331739679055889, 0.6505545394130752, 0.6724380514485606, 0.7206987345498825, 0.7208788053571439] roc_auc_score train: 0.9250976217974811 roc_auc_score cv: 0.7208788053571439
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:35<00:00, 5.96s/it] 100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:27<00:00, 17.56s/it]
train_auc = [0.6305308552567224, 0.6329945187583427, 0.6492002065326807, 0.6892792932448508, 0.7043416672188931, 0.7121949304036933, 0.6303589629503407, 0.6410713508089328, 0.6672691378010028, 0.7084746362405687, 0.7355778181691959, 0.7584528469638592, 0.6327612622633709, 0.6480588334138007, 0.6917145368082576, 0.8164735175876225, 0.8627140074597901, 0.8965112267631012, 0.6409815706266035, 0.6650909201354375, 0.7028190290912878, 0.8469069522150591, 0.9034203129388229, 0.9289626111158444, 0.640986627442813, 0.667643799759545, 0.7120918415646625, 0.8730423494551892, 0.9250976217974811, 0.9522003106668053] cv_auc = [0.6246045610612336, 0.6241497514216621, 0.6374883201048422, 0.6618528203496723, 0.6668381874428195, 0.6670094318871687, 0.6246269927017845, 0.6338548982347989, 0.6502885781792413, 0.670089669995457, 0.6823510322878433, 0.6856068548752831, 0.623983188363167, 0.638927846254966, 0.6636920035622746, 0.7098033291285386, 0.7159148458513916, 0.7105087717142344, 0.6330548851529262, 0.6505155441191033, 0.6682110038445231, 0.7171994473884076, 0.7196363362928144, 0.7156785659042575, 0.6331739679055889, 0.6505545394130752, 0.6724380514485606, 0.7206987345498825, 0.7208788053571439, 0.7151332494715072] roc_auc_score train: 0.9522003106668053 roc_auc_score cv: 0.7151332494715072
from sklearn.metrics import roc_curve, auc
from xgboost import XGBClassifier
estimators = 100
l_rate = 0.01
clf = XGBClassifier(n_estimators = estimators,learning_rate = l_rate)
clf.fit(X_tr,y_train)
train_pred = clf.predict_proba(X_tr)
test_pred = clf.predict_proba(X_te)
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train,np.transpose(train_pred)[1])
test_fpr, test_tpr, te_thresholds = roc_curve(y_test,np.transpose(test_pred)[1])
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ERROR PLOTS")
plt.grid()
plt.show()
TEST ACCURACY OBTAINED FOR BAG OF WORDS DATA IS 66%
from xgboost import XGBClassifier
from tqdm import tqdm
estimators = [5,10,50, 75, 100]
l_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
train_auc = []
cv_auc = []
for i in tqdm(estimators):
for j in tqdm(l_rate):
pred_list = []
pred_list_cv = []
clf = XGBClassifier(n_estimators = i,learning_rate = j,n_jobs = -1)
clf.fit(X_tr_tfidf,y_train)
train_pred = clf.predict_proba(X_tr_tfidf)
cv_pred = clf.predict_proba(X_cr_tfidf)
train_auc.append(roc_auc_score(y_train,np.transpose(train_pred)[1]))
cv_auc.append(roc_auc_score(y_cv,np.transpose(cv_pred)[1]))
print("train_auc = ",train_auc,"\ncv_auc = ",cv_auc)
print("roc_auc_score train: ",roc_auc_score(y_train,np.transpose(train_pred)[1]))
print("roc_auc_score cv: ",roc_auc_score(y_cv,np.transpose(cv_pred)[1]))
0%| | 0/5 [00:00<?, ?it/s] 0%| | 0/6 [00:00<?, ?it/s] 17%|██████████████ | 1/6 [00:01<00:08, 1.78s/it]
train_auc = [0.652193415725515] cv_auc = [0.6242118610582885] roc_auc_score train: 0.652193415725515 roc_auc_score cv: 0.6242118610582885
33%|████████████████████████████ | 2/6 [00:03<00:07, 1.77s/it]
train_auc = [0.652193415725515, 0.6520705216804092] cv_auc = [0.6242118610582885, 0.6240723460069502] roc_auc_score train: 0.6520705216804092 roc_auc_score cv: 0.6240723460069502
50%|██████████████████████████████████████████ | 3/6 [00:05<00:05, 1.79s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095] roc_auc_score train: 0.6543583760125465 roc_auc_score cv: 0.628059375057095
67%|████████████████████████████████████████████████████████ | 4/6 [00:07<00:03, 1.81s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939] roc_auc_score train: 0.6999534315349686 roc_auc_score cv: 0.6531943371355939
83%|██████████████████████████████████████████████████████████████████████ | 5/6 [00:09<00:01, 1.81s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683] roc_auc_score train: 0.7203369488385889 roc_auc_score cv: 0.659141907824683
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:10<00:00, 1.83s/it] 20%|████████████████▊ | 1/5 [00:10<00:43, 10.96s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658] roc_auc_score train: 0.7240553220426698 roc_auc_score cv: 0.6683784446628658
0%| | 0/6 [00:00<?, ?it/s] 17%|██████████████ | 1/6 [00:04<00:20, 4.18s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451] roc_auc_score train: 0.6521386190868382 roc_auc_score cv: 0.6241619262758451
33%|████████████████████████████ | 2/6 [00:08<00:15, 3.99s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328] roc_auc_score train: 0.6521184588780653 roc_auc_score cv: 0.6242108370051328
50%|██████████████████████████████████████████ | 3/6 [00:11<00:11, 3.93s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088] roc_auc_score train: 0.6715928694946971 roc_auc_score cv: 0.6415291850923088
67%|████████████████████████████████████████████████████████ | 4/6 [00:16<00:08, 4.14s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392] roc_auc_score train: 0.7268363737342182 roc_auc_score cv: 0.6676599494631392
83%|██████████████████████████████████████████████████████████████████████ | 5/6 [00:19<00:03, 3.96s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518] roc_auc_score train: 0.75862645117214 roc_auc_score cv: 0.6758782686232518
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:23<00:00, 3.97s/it] 40%|█████████████████████████████████▌ | 2/5 [00:34<00:55, 18.52s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926] roc_auc_score train: 0.7728523165109387 roc_auc_score cv: 0.6834200462533926
0%| | 0/6 [00:00<?, ?it/s] 17%|██████████████ | 1/6 [00:17<01:28, 17.80s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457] roc_auc_score train: 0.6520426815799355 roc_auc_score cv: 0.6240347973912457
33%|████████████████████████████ | 2/6 [00:36<01:12, 18.17s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355, 0.6546536435899135] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457, 0.6289084451615634] roc_auc_score train: 0.6546536435899135 roc_auc_score cv: 0.6289084451615634
50%|██████████████████████████████████████████ | 3/6 [00:54<00:55, 18.40s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355, 0.6546536435899135, 0.7063109894635199] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457, 0.6289084451615634, 0.657325806445111] roc_auc_score train: 0.7063109894635199 roc_auc_score cv: 0.657325806445111
67%|████████████████████████████████████████████████████████ | 4/6 [01:12<00:36, 18.21s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355, 0.6546536435899135, 0.7063109894635199, 0.8382894037216906] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457, 0.6289084451615634, 0.657325806445111, 0.7071402220056214] roc_auc_score train: 0.8382894037216906 roc_auc_score cv: 0.7071402220056214
83%|██████████████████████████████████████████████████████████████████████ | 5/6 [01:31<00:18, 18.23s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355, 0.6546536435899135, 0.7063109894635199, 0.8382894037216906, 0.8879065333118754] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457, 0.6289084451615634, 0.657325806445111, 0.7071402220056214, 0.7072724549011867] roc_auc_score train: 0.8879065333118754 roc_auc_score cv: 0.7072724549011867
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [01:49<00:00, 18.29s/it] 60%|██████████████████████████████████████████████████▍ | 3/5 [02:24<02:00, 60.17s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355, 0.6546536435899135, 0.7063109894635199, 0.8382894037216906, 0.8879065333118754, 0.9119226551677349] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457, 0.6289084451615634, 0.657325806445111, 0.7071402220056214, 0.7072724549011867, 0.7017480132230944] roc_auc_score train: 0.9119226551677349 roc_auc_score cv: 0.7017480132230944
0%| | 0/6 [00:00<?, ?it/s] 17%|██████████████ | 1/6 [00:28<02:22, 28.40s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355, 0.6546536435899135, 0.7063109894635199, 0.8382894037216906, 0.8879065333118754, 0.9119226551677349, 0.6520549725622359] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457, 0.6289084451615634, 0.657325806445111, 0.7071402220056214, 0.7072724549011867, 0.7017480132230944, 0.6240413968449151] roc_auc_score train: 0.6520549725622359 roc_auc_score cv: 0.6240413968449151
33%|████████████████████████████ | 2/6 [00:57<01:54, 28.66s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355, 0.6546536435899135, 0.7063109894635199, 0.8382894037216906, 0.8879065333118754, 0.9119226551677349, 0.6520549725622359, 0.656183054380102] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457, 0.6289084451615634, 0.657325806445111, 0.7071402220056214, 0.7072724549011867, 0.7017480132230944, 0.6240413968449151, 0.6298172029356711] roc_auc_score train: 0.656183054380102 roc_auc_score cv: 0.6298172029356711
50%|██████████████████████████████████████████ | 3/6 [01:26<01:26, 28.85s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355, 0.6546536435899135, 0.7063109894635199, 0.8382894037216906, 0.8879065333118754, 0.9119226551677349, 0.6520549725622359, 0.656183054380102, 0.7209794484993641] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457, 0.6289084451615634, 0.657325806445111, 0.7071402220056214, 0.7072724549011867, 0.7017480132230944, 0.6240413968449151, 0.6298172029356711, 0.6645964538112039] roc_auc_score train: 0.7209794484993641 roc_auc_score cv: 0.6645964538112039
67%|████████████████████████████████████████████████████████ | 4/6 [01:55<00:58, 29.16s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355, 0.6546536435899135, 0.7063109894635199, 0.8382894037216906, 0.8879065333118754, 0.9119226551677349, 0.6520549725622359, 0.656183054380102, 0.7209794484993641, 0.8705912846048742] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457, 0.6289084451615634, 0.657325806445111, 0.7071402220056214, 0.7072724549011867, 0.7017480132230944, 0.6240413968449151, 0.6298172029356711, 0.6645964538112039, 0.7144543835034205] roc_auc_score train: 0.8705912846048742 roc_auc_score cv: 0.7144543835034205
83%|██████████████████████████████████████████████████████████████████████ | 5/6 [02:27<00:30, 30.05s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355, 0.6546536435899135, 0.7063109894635199, 0.8382894037216906, 0.8879065333118754, 0.9119226551677349, 0.6520549725622359, 0.656183054380102, 0.7209794484993641, 0.8705912846048742, 0.9245824142707928] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457, 0.6289084451615634, 0.657325806445111, 0.7071402220056214, 0.7072724549011867, 0.7017480132230944, 0.6240413968449151, 0.6298172029356711, 0.6645964538112039, 0.7144543835034205, 0.7106290248133607] roc_auc_score train: 0.9245824142707928 roc_auc_score cv: 0.7106290248133607
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [02:59<00:00, 29.94s/it] 80%|██████████████████████████████████████████████████████████████████▍ | 4/5 [05:24<01:47, 107.33s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355, 0.6546536435899135, 0.7063109894635199, 0.8382894037216906, 0.8879065333118754, 0.9119226551677349, 0.6520549725622359, 0.656183054380102, 0.7209794484993641, 0.8705912846048742, 0.9245824142707928, 0.9477205789541778] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457, 0.6289084451615634, 0.657325806445111, 0.7071402220056214, 0.7072724549011867, 0.7017480132230944, 0.6240413968449151, 0.6298172029356711, 0.6645964538112039, 0.7144543835034205, 0.7106290248133607, 0.7007731796382342] roc_auc_score train: 0.9477205789541778 roc_auc_score cv: 0.7007731796382342
0%| | 0/6 [00:00<?, ?it/s] 17%|██████████████ | 1/6 [00:47<03:55, 47.08s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355, 0.6546536435899135, 0.7063109894635199, 0.8382894037216906, 0.8879065333118754, 0.9119226551677349, 0.6520549725622359, 0.656183054380102, 0.7209794484993641, 0.8705912846048742, 0.9245824142707928, 0.9477205789541778, 0.6520442909254998] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457, 0.6289084451615634, 0.657325806445111, 0.7071402220056214, 0.7072724549011867, 0.7017480132230944, 0.6240413968449151, 0.6298172029356711, 0.6645964538112039, 0.7144543835034205, 0.7106290248133607, 0.7007731796382342, 0.6241576837699148] roc_auc_score train: 0.6520442909254998 roc_auc_score cv: 0.6241576837699148
33%|████████████████████████████ | 2/6 [01:33<03:07, 46.86s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355, 0.6546536435899135, 0.7063109894635199, 0.8382894037216906, 0.8879065333118754, 0.9119226551677349, 0.6520549725622359, 0.656183054380102, 0.7209794484993641, 0.8705912846048742, 0.9245824142707928, 0.9477205789541778, 0.6520442909254998, 0.6717936905764929] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457, 0.6289084451615634, 0.657325806445111, 0.7071402220056214, 0.7072724549011867, 0.7017480132230944, 0.6240413968449151, 0.6298172029356711, 0.6645964538112039, 0.7144543835034205, 0.7106290248133607, 0.7007731796382342, 0.6241576837699148, 0.6418862545473649] roc_auc_score train: 0.6717936905764929 roc_auc_score cv: 0.6418862545473649
50%|██████████████████████████████████████████ | 3/6 [02:18<02:17, 45.78s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355, 0.6546536435899135, 0.7063109894635199, 0.8382894037216906, 0.8879065333118754, 0.9119226551677349, 0.6520549725622359, 0.656183054380102, 0.7209794484993641, 0.8705912846048742, 0.9245824142707928, 0.9477205789541778, 0.6520442909254998, 0.6717936905764929, 0.7318935686133439] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457, 0.6289084451615634, 0.657325806445111, 0.7071402220056214, 0.7072724549011867, 0.7017480132230944, 0.6240413968449151, 0.6298172029356711, 0.6645964538112039, 0.7144543835034205, 0.7106290248133607, 0.7007731796382342, 0.6241576837699148, 0.6418862545473649, 0.6695821947650273] roc_auc_score train: 0.7318935686133439 roc_auc_score cv: 0.6695821947650273
67%|████████████████████████████████████████████████████████ | 4/6 [03:01<01:29, 44.66s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355, 0.6546536435899135, 0.7063109894635199, 0.8382894037216906, 0.8879065333118754, 0.9119226551677349, 0.6520549725622359, 0.656183054380102, 0.7209794484993641, 0.8705912846048742, 0.9245824142707928, 0.9477205789541778, 0.6520442909254998, 0.6717936905764929, 0.7318935686133439, 0.8962666811823325] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457, 0.6289084451615634, 0.657325806445111, 0.7071402220056214, 0.7072724549011867, 0.7017480132230944, 0.6240413968449151, 0.6298172029356711, 0.6645964538112039, 0.7144543835034205, 0.7106290248133607, 0.7007731796382342, 0.6241576837699148, 0.6418862545473649, 0.6695821947650273, 0.7173596223057893] roc_auc_score train: 0.8962666811823325 roc_auc_score cv: 0.7173596223057893
83%|██████████████████████████████████████████████████████████████████████ | 5/6 [03:43<00:43, 43.63s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355, 0.6546536435899135, 0.7063109894635199, 0.8382894037216906, 0.8879065333118754, 0.9119226551677349, 0.6520549725622359, 0.656183054380102, 0.7209794484993641, 0.8705912846048742, 0.9245824142707928, 0.9477205789541778, 0.6520442909254998, 0.6717936905764929, 0.7318935686133439, 0.8962666811823325, 0.9480443216924004] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457, 0.6289084451615634, 0.657325806445111, 0.7071402220056214, 0.7072724549011867, 0.7017480132230944, 0.6240413968449151, 0.6298172029356711, 0.6645964538112039, 0.7144543835034205, 0.7106290248133607, 0.7007731796382342, 0.6241576837699148, 0.6418862545473649, 0.6695821947650273, 0.7173596223057893, 0.7119341236683001] roc_auc_score train: 0.9480443216924004 roc_auc_score cv: 0.7119341236683001
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [04:24<00:00, 44.15s/it] 100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [09:49<00:00, 117.80s/it]
train_auc = [0.652193415725515, 0.6520705216804092, 0.6543583760125465, 0.6999534315349686, 0.7203369488385889, 0.7240553220426698, 0.6521386190868382, 0.6521184588780653, 0.6715928694946971, 0.7268363737342182, 0.75862645117214, 0.7728523165109387, 0.6520426815799355, 0.6546536435899135, 0.7063109894635199, 0.8382894037216906, 0.8879065333118754, 0.9119226551677349, 0.6520549725622359, 0.656183054380102, 0.7209794484993641, 0.8705912846048742, 0.9245824142707928, 0.9477205789541778, 0.6520442909254998, 0.6717936905764929, 0.7318935686133439, 0.8962666811823325, 0.9480443216924004, 0.9659529968538871] cv_auc = [0.6242118610582885, 0.6240723460069502, 0.628059375057095, 0.6531943371355939, 0.659141907824683, 0.6683784446628658, 0.6241619262758451, 0.6242108370051328, 0.6415291850923088, 0.6676599494631392, 0.6758782686232518, 0.6834200462533926, 0.6240347973912457, 0.6289084451615634, 0.657325806445111, 0.7071402220056214, 0.7072724549011867, 0.7017480132230944, 0.6240413968449151, 0.6298172029356711, 0.6645964538112039, 0.7144543835034205, 0.7106290248133607, 0.7007731796382342, 0.6241576837699148, 0.6418862545473649, 0.6695821947650273, 0.7173596223057893, 0.7119341236683001, 0.7010599307766074] roc_auc_score train: 0.9659529968538871 roc_auc_score cv: 0.7010599307766074
from sklearn.metrics import roc_curve, auc
from xgboost import XGBClassifier
estimators = 100
l_rate = 0.01
clf = XGBClassifier(n_estimators = estimators,learning_rate = l_rate)
clf.fit(X_tr_tfidf,y_train)
train_pred = clf.predict_proba(X_tr_tfidf)
test_pred = clf.predict_proba(X_te_tfidf)
train_fpr1, train_tpr1, tr_thresholds1 = roc_curve(y_train,np.transpose(train_pred)[1])
test_fpr, test_tpr, te_thresholds = roc_curve(y_test,np.transpose(test_pred)[1])
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr1, train_tpr1)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ERROR PLOTS")
plt.grid()
plt.show()
THE ACCURACY OBTAINED FOR TFIDF VECTORIZED DATA IS 67%
from xgboost import XGBClassifier
from tqdm import tqdm
estimators = [5,10,50, 75, 100]
l_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
train_auc = []
cv_auc = []
for i in tqdm(estimators):
for j in tqdm(l_rate):
pred_list = []
pred_list_cv = []
clf = XGBClassifier(n_estimators = i,learning_rate = j,n_jobs = -1)
clf.fit(X_tr_tfidfw2v,y_train)
train_pred = clf.predict_proba(X_tr_tfidfw2v)
cv_pred = clf.predict_proba(X_cr_tfidfw2v)
train_auc.append(roc_auc_score(y_train,np.transpose(train_pred)[1]))
cv_auc.append(roc_auc_score(y_cv,np.transpose(cv_pred)[1]))
print("train_auc = ",train_auc,"\ncv_auc = ",cv_auc)
print("roc_auc_score train: ",roc_auc_score(y_train,np.transpose(train_pred)[1]))
print("roc_auc_score cv: ",roc_auc_score(y_cv,np.transpose(cv_pred)[1]))
0%| | 0/5 [00:00<?, ?it/s] 0%| | 0/6 [00:00<?, ?it/s] 33%|████████████████████████████ | 2/6 [00:00<00:00, 12.69it/s]
train_auc = [0.5917905034411595] cv_auc = [0.5920054705894859] roc_auc_score train: 0.5917905034411595 roc_auc_score cv: 0.5920054705894859 train_auc = [0.5917905034411595, 0.592053694552523] cv_auc = [0.5920054705894859, 0.5922980897149888] roc_auc_score train: 0.592053694552523 roc_auc_score cv: 0.5922980897149888 train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304] roc_auc_score train: 0.593175720024361 roc_auc_score cv: 0.5929314909739304
67%|████████████████████████████████████████████████████████ | 4/6 [00:00<00:00, 12.46it/s] 100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 12.30it/s] 20%|████████████████▊ | 1/5 [00:00<00:01, 2.04it/s]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993] roc_auc_score train: 0.6003048802615345 roc_auc_score cv: 0.59805191929993 train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865] roc_auc_score train: 0.6029747411635883 roc_auc_score cv: 0.5997506284272865 train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154] roc_auc_score train: 0.6078402582526294 roc_auc_score cv: 0.600254088719154
0%| | 0/6 [00:00<?, ?it/s] 17%|██████████████ | 1/6 [00:00<00:00, 7.32it/s]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836] roc_auc_score train: 0.5918966847481374 roc_auc_score cv: 0.5923592078080836 train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283]
33%|████████████████████████████ | 2/6 [00:00<00:00, 6.18it/s]
cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227] roc_auc_score train: 0.5929442464318283 roc_auc_score cv: 0.5937809999583227 train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243] roc_auc_score train: 0.5943472525946754
50%|██████████████████████████████████████████ | 3/6 [00:00<00:00, 5.95it/s] 67%|████████████████████████████████████████████████████████ | 4/6 [00:00<00:00, 6.69it/s]
roc_auc_score cv: 0.5945592315921243 train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949] roc_auc_score train: 0.6047832708528584 roc_auc_score cv: 0.5989187396590949
83%|██████████████████████████████████████████████████████████████████████ | 5/6 [00:00<00:00, 7.14it/s] 100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 6.85it/s] 40%|█████████████████████████████████▌ | 2/5 [00:01<00:02, 1.39it/s]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508] roc_auc_score train: 0.6152479891069395 roc_auc_score cv: 0.6028601089423508 train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922] roc_auc_score train: 0.6235164437249723 roc_auc_score cv: 0.6042184748161922
0%| | 0/6 [00:00<?, ?it/s] 17%|██████████████ | 1/6 [00:00<00:02, 2.49it/s]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144] roc_auc_score train: 0.5918479113219038 roc_auc_score cv: 0.5923810705302144
33%|████████████████████████████ | 2/6 [00:00<00:01, 2.52it/s]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038, 0.5927634277798288] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144, 0.5925035342837717] roc_auc_score train: 0.5927634277798288 roc_auc_score cv: 0.5925035342837717
50%|██████████████████████████████████████████ | 3/6 [00:01<00:01, 2.52it/s]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038, 0.5927634277798288, 0.6015213561734181] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144, 0.5925035342837717, 0.599063553779143] roc_auc_score train: 0.6015213561734181 roc_auc_score cv: 0.599063553779143
67%|████████████████████████████████████████████████████████ | 4/6 [00:01<00:00, 2.64it/s]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038, 0.5927634277798288, 0.6015213561734181, 0.6405294313014557] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144, 0.5925035342837717, 0.599063553779143, 0.6033502565431957] roc_auc_score train: 0.6405294313014557 roc_auc_score cv: 0.6033502565431957
83%|██████████████████████████████████████████████████████████████████████ | 5/6 [00:01<00:00, 2.64it/s]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038, 0.5927634277798288, 0.6015213561734181, 0.6405294313014557, 0.6665104418126911] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144, 0.5925035342837717, 0.599063553779143, 0.6033502565431957, 0.5984265114423148] roc_auc_score train: 0.6665104418126911 roc_auc_score cv: 0.5984265114423148
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:02<00:00, 2.63it/s] 60%|██████████████████████████████████████████████████▍ | 3/5 [00:03<00:02, 1.44s/it]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038, 0.5927634277798288, 0.6015213561734181, 0.6405294313014557, 0.6665104418126911, 0.685062997201001] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144, 0.5925035342837717, 0.599063553779143, 0.6033502565431957, 0.5984265114423148, 0.5945638804683544] roc_auc_score train: 0.685062997201001 roc_auc_score cv: 0.5945638804683544
0%| | 0/6 [00:00<?, ?it/s] 17%|██████████████ | 1/6 [00:00<00:02, 2.04it/s]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038, 0.5927634277798288, 0.6015213561734181, 0.6405294313014557, 0.6665104418126911, 0.685062997201001, 0.5928828940766616] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144, 0.5925035342837717, 0.599063553779143, 0.6033502565431957, 0.5984265114423148, 0.5945638804683544, 0.5939025046779723] roc_auc_score train: 0.5928828940766616 roc_auc_score cv: 0.5939025046779723
33%|████████████████████████████ | 2/6 [00:00<00:01, 2.06it/s]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038, 0.5927634277798288, 0.6015213561734181, 0.6405294313014557, 0.6665104418126911, 0.685062997201001, 0.5928828940766616, 0.5937141486719744] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144, 0.5925035342837717, 0.599063553779143, 0.6033502565431957, 0.5984265114423148, 0.5945638804683544, 0.5939025046779723, 0.5936027009255684] roc_auc_score train: 0.5937141486719744 roc_auc_score cv: 0.5936027009255684
50%|██████████████████████████████████████████ | 3/6 [00:01<00:01, 2.04it/s]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038, 0.5927634277798288, 0.6015213561734181, 0.6405294313014557, 0.6665104418126911, 0.685062997201001, 0.5928828940766616, 0.5937141486719744, 0.6026023275554673] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144, 0.5925035342837717, 0.599063553779143, 0.6033502565431957, 0.5984265114423148, 0.5945638804683544, 0.5939025046779723, 0.5936027009255684, 0.5995533437741242] roc_auc_score train: 0.6026023275554673 roc_auc_score cv: 0.5995533437741242
67%|████████████████████████████████████████████████████████ | 4/6 [00:01<00:00, 2.04it/s]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038, 0.5927634277798288, 0.6015213561734181, 0.6405294313014557, 0.6665104418126911, 0.685062997201001, 0.5928828940766616, 0.5937141486719744, 0.6026023275554673, 0.6550547848052534] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144, 0.5925035342837717, 0.599063553779143, 0.6033502565431957, 0.5984265114423148, 0.5945638804683544, 0.5939025046779723, 0.5936027009255684, 0.5995533437741242, 0.6023384595158627] roc_auc_score train: 0.6550547848052534 roc_auc_score cv: 0.6023384595158627
83%|██████████████████████████████████████████████████████████████████████ | 5/6 [00:02<00:00, 1.76it/s]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038, 0.5927634277798288, 0.6015213561734181, 0.6405294313014557, 0.6665104418126911, 0.685062997201001, 0.5928828940766616, 0.5937141486719744, 0.6026023275554673, 0.6550547848052534, 0.684061234809829] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144, 0.5925035342837717, 0.599063553779143, 0.6033502565431957, 0.5984265114423148, 0.5945638804683544, 0.5939025046779723, 0.5936027009255684, 0.5995533437741242, 0.6023384595158627, 0.5942285924613693] roc_auc_score train: 0.684061234809829 roc_auc_score cv: 0.5942285924613693
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00, 1.88it/s] 80%|███████████████████████████████████████████████████████████████████▏ | 4/5 [00:06<00:02, 2.13s/it]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038, 0.5927634277798288, 0.6015213561734181, 0.6405294313014557, 0.6665104418126911, 0.685062997201001, 0.5928828940766616, 0.5937141486719744, 0.6026023275554673, 0.6550547848052534, 0.684061234809829, 0.7071903548133632] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144, 0.5925035342837717, 0.599063553779143, 0.6033502565431957, 0.5984265114423148, 0.5945638804683544, 0.5939025046779723, 0.5936027009255684, 0.5995533437741242, 0.6023384595158627, 0.5942285924613693, 0.5902543396934511] roc_auc_score train: 0.7071903548133632 roc_auc_score cv: 0.5902543396934511
0%| | 0/6 [00:00<?, ?it/s] 17%|██████████████ | 1/6 [00:00<00:03, 1.39it/s]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038, 0.5927634277798288, 0.6015213561734181, 0.6405294313014557, 0.6665104418126911, 0.685062997201001, 0.5928828940766616, 0.5937141486719744, 0.6026023275554673, 0.6550547848052534, 0.684061234809829, 0.7071903548133632, 0.5929424950851849] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144, 0.5925035342837717, 0.599063553779143, 0.6033502565431957, 0.5984265114423148, 0.5945638804683544, 0.5939025046779723, 0.5936027009255684, 0.5995533437741242, 0.6023384595158627, 0.5942285924613693, 0.5902543396934511, 0.5937517575515467] roc_auc_score train: 0.5929424950851849 roc_auc_score cv: 0.5937517575515467
33%|████████████████████████████ | 2/6 [00:02<00:04, 1.13s/it]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038, 0.5927634277798288, 0.6015213561734181, 0.6405294313014557, 0.6665104418126911, 0.685062997201001, 0.5928828940766616, 0.5937141486719744, 0.6026023275554673, 0.6550547848052534, 0.684061234809829, 0.7071903548133632, 0.5929424950851849, 0.5941955520528625] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144, 0.5925035342837717, 0.599063553779143, 0.6033502565431957, 0.5984265114423148, 0.5945638804683544, 0.5939025046779723, 0.5936027009255684, 0.5995533437741242, 0.6023384595158627, 0.5942285924613693, 0.5902543396934511, 0.5937517575515467, 0.5943088099585561] roc_auc_score train: 0.5941955520528625 roc_auc_score cv: 0.5943088099585561
50%|██████████████████████████████████████████ | 3/6 [00:02<00:02, 1.12it/s]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038, 0.5927634277798288, 0.6015213561734181, 0.6405294313014557, 0.6665104418126911, 0.685062997201001, 0.5928828940766616, 0.5937141486719744, 0.6026023275554673, 0.6550547848052534, 0.684061234809829, 0.7071903548133632, 0.5929424950851849, 0.5941955520528625, 0.6044197757014064] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144, 0.5925035342837717, 0.599063553779143, 0.6033502565431957, 0.5984265114423148, 0.5945638804683544, 0.5939025046779723, 0.5936027009255684, 0.5995533437741242, 0.6023384595158627, 0.5942285924613693, 0.5902543396934511, 0.5937517575515467, 0.5943088099585561, 0.5998820648370641] roc_auc_score train: 0.6044197757014064 roc_auc_score cv: 0.5998820648370641
67%|████████████████████████████████████████████████████████ | 4/6 [00:03<00:01, 1.29it/s]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038, 0.5927634277798288, 0.6015213561734181, 0.6405294313014557, 0.6665104418126911, 0.685062997201001, 0.5928828940766616, 0.5937141486719744, 0.6026023275554673, 0.6550547848052534, 0.684061234809829, 0.7071903548133632, 0.5929424950851849, 0.5941955520528625, 0.6044197757014064, 0.6659787069381727] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144, 0.5925035342837717, 0.599063553779143, 0.6033502565431957, 0.5984265114423148, 0.5945638804683544, 0.5939025046779723, 0.5936027009255684, 0.5995533437741242, 0.6023384595158627, 0.5942285924613693, 0.5902543396934511, 0.5937517575515467, 0.5943088099585561, 0.5998820648370641, 0.5999272044499693] roc_auc_score train: 0.6659787069381727 roc_auc_score cv: 0.5999272044499693
83%|██████████████████████████████████████████████████████████████████████ | 5/6 [00:03<00:00, 1.40it/s]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038, 0.5927634277798288, 0.6015213561734181, 0.6405294313014557, 0.6665104418126911, 0.685062997201001, 0.5928828940766616, 0.5937141486719744, 0.6026023275554673, 0.6550547848052534, 0.684061234809829, 0.7071903548133632, 0.5929424950851849, 0.5941955520528625, 0.6044197757014064, 0.6659787069381727, 0.6989787992388743] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144, 0.5925035342837717, 0.599063553779143, 0.6033502565431957, 0.5984265114423148, 0.5945638804683544, 0.5939025046779723, 0.5936027009255684, 0.5995533437741242, 0.6023384595158627, 0.5942285924613693, 0.5902543396934511, 0.5937517575515467, 0.5943088099585561, 0.5998820648370641, 0.5999272044499693, 0.5910151786784201] roc_auc_score train: 0.6989787992388743 roc_auc_score cv: 0.5910151786784201
100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00, 1.32it/s] 100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:11<00:00, 2.28s/it]
train_auc = [0.5917905034411595, 0.592053694552523, 0.593175720024361, 0.6003048802615345, 0.6029747411635883, 0.6078402582526294, 0.5918966847481374, 0.5929442464318283, 0.5943472525946754, 0.6047832708528584, 0.6152479891069395, 0.6235164437249723, 0.5918479113219038, 0.5927634277798288, 0.6015213561734181, 0.6405294313014557, 0.6665104418126911, 0.685062997201001, 0.5928828940766616, 0.5937141486719744, 0.6026023275554673, 0.6550547848052534, 0.684061234809829, 0.7071903548133632, 0.5929424950851849, 0.5941955520528625, 0.6044197757014064, 0.6659787069381727, 0.6989787992388743, 0.7228748236819934] cv_auc = [0.5920054705894859, 0.5922980897149888, 0.5929314909739304, 0.59805191929993, 0.5997506284272865, 0.600254088719154, 0.5923592078080836, 0.5937809999583227, 0.5945592315921243, 0.5989187396590949, 0.6028601089423508, 0.6042184748161922, 0.5923810705302144, 0.5925035342837717, 0.599063553779143, 0.6033502565431957, 0.5984265114423148, 0.5945638804683544, 0.5939025046779723, 0.5936027009255684, 0.5995533437741242, 0.6023384595158627, 0.5942285924613693, 0.5902543396934511, 0.5937517575515467, 0.5943088099585561, 0.5998820648370641, 0.5999272044499693, 0.5910151786784201, 0.5887998753711055] roc_auc_score train: 0.7228748236819934 roc_auc_score cv: 0.5887998753711055
from sklearn.metrics import roc_curve, auc
from xgboost import XGBClassifier
estimators = 100
l_rate = 0.01
clf = XGBClassifier(n_estimators = estimators,learning_rate = l_rate)
clf.fit(X_tr_tfidfw2v,y_train)
train_pred2 = clf.predict_proba(X_tr_tfidfw2v)
test_pred = clf.predict_proba(X_te_tfidfw2v)
train_fpr2, train_tpr2, tr_thresholds2 = roc_curve(y_train,np.transpose(train_pred2)[1])
test_fpr, test_tpr, te_thresholds = roc_curve(y_test,np.transpose(test_pred)[1])
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr1, train_tpr1)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ERROR PLOTS")
plt.grid()
plt.show()
THE ACCURACY OBTAINED FOR TFIDFW2V DATA IS 59%
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Vectorizer","Model","Hyperparameter1","Hyperparameter2","AUC"]
x.add_row(["BOW","XGBoost","estimators","lrate","0.66"])
x.add_row(["TFIDF","XGBoost","estimators","lrate","0.67"])
x.add_row(["TFIDFW2V","XGBoost","estimators","lrate","0.59"])
print(x)
+------------+---------+-----------------+-----------------+------+ | Vectorizer | Model | Hyperparameter1 | Hyperparameter2 | AUC | +------------+---------+-----------------+-----------------+------+ | BOW | XGBoost | estimators | lrate | 0.66 | | TFIDF | XGBoost | estimators | lrate | 0.67 | | TFIDFW2V | XGBoost | estimators | lrate | 0.59 | +------------+---------+-----------------+-----------------+------+
CONFUSION MATRIX FOR HIGHEST ACCURACY FOR XGBoost TFIDF
def find_best_threshold(threshould, fpr, tpr):
t = threshould[np.argmax(tpr*(1-fpr))]
print("the maximum value of tpr*(1-fpr)", max(tpr*(1-fpr)), "for threshold", np.round(t,3))
return t
def predict_with_best_t(proba, threshould):
predictions = []
for i in proba:
if i[1] >= threshould:
predictions.append(1)
else:
predictions.append(0)
return predictions
print("="*100)
from sklearn.metrics import confusion_matrix
best_t = find_best_threshold(tr_thresholds1, train_fpr1, train_tpr1)
print("Test confusion matrix")
confu_mat = confusion_matrix(y_test, predict_with_best_t(test_pred1, best_t))
sns.heatmap(confu_mat,annot = True)
==================================================================================================== the maximum value of tpr*(1-fpr) 0.43640402778172227 for threshold 0.616 Test confusion matrix
<AxesSubplot:>
Applying BAG OF WORDS vectorized data to LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0)
clf.fit(X_tr,y_train)
train_pred2 = clf.predict_proba(X_tr)
test_pred = clf.predict_proba(X_te)
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train,np.transpose(train_pred2)[1])
test_fpr, test_tpr, te_thresholds = roc_curve(y_test,np.transpose(test_pred)[1])
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ERROR PLOTS")
plt.grid()
plt.show()
ACCURACY OBTAINED FOR BOW FOR LOGISTIC REGRESSION 0.677 OR 68%
APPLYING TFIDF VECTORIZED DATA TO LOGISTIC REGRESSION
clf = LogisticRegression(random_state=0)
clf.fit(X_tr_tfidf,y_train)
train_pred1 = clf.predict_proba(X_tr_tfidf)
test_pred1 = clf.predict_proba(X_te_tfidf)
train_fpr1, train_tpr1, tr_thresholds1 = roc_curve(y_train,np.transpose(train_pred1)[1])
test_fpr, test_tpr, te_thresholds = roc_curve(y_test,np.transpose(test_pred1)[1])
plt.plot(train_fpr1, train_tpr1, label="train AUC ="+str(auc(train_fpr1, train_tpr1)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ERROR PLOTS")
plt.grid()
plt.show()
THE ACCURACY OBTAINED FOR TFIDF IS 0.728 OR 73%
clf = LogisticRegression(random_state=0)
clf.fit(X_tr_tfidfw2v,y_train)
train_pred2 = clf.predict_proba(X_tr_tfidfw2v)
test_pred2 = clf.predict_proba(X_te_tfidfw2v)
train_fpr2, train_tpr2, tr_thresholds2 = roc_curve(y_train,np.transpose(train_pred2)[1])
test_fpr, test_tpr, te_thresholds = roc_curve(y_test,np.transpose(test_pred2)[1])
plt.plot(train_fpr2, train_tpr2, label="train AUC ="+str(auc(train_fpr2, train_tpr2)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ERROR PLOTS")
plt.grid()
plt.show()
THE ACCURACY OBTAINED FOR TFIDFW2V VECTORIZED DATA IS 0.59 or 60%
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Vectorizer","Model","AUC"]
x.add_row(["BOW","Logistic Regression","0.68"])
x.add_row(["TFIDF","Logistic Regression","0.73"])
x.add_row(["TFIDFW2V","Logistic Regression","0.60"])
print(x)
+------------+---------------------+------+ | Vectorizer | Model | AUC | +------------+---------------------+------+ | BOW | Logistic Regression | 0.68 | | TFIDF | Logistic Regression | 0.73 | | TFIDFW2V | Logistic Regression | 0.60 | +------------+---------------------+------+
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Vectorizer","Model","Hyperparameter1","Hyperparameter2","AUC"]
x.add_row(["BOW","XGBoost","estimators","lrate","0.66"])
x.add_row(["TFIDF","XGBoost","estimators","lrate","0.67"])
x.add_row(["TFIDFW2V","XGBoost","estimators","lrate","0.59"])
x.add_row(["BOW","Logistic Regression","alpha","None","0.68"])
x.add_row(["TFIDF","Logistic Regression","alpha","None","0.73"])
x.add_row(["TFIDFW2V","Logistic Regression","alpha","None","0.60"])
x.add_row(["BOW","Naive Bayes","alpha","None","0.70"])
x.add_row(["TFIDF","Naive Bayes","alpha","None","0.67"])
x.add_row(["TFIDFW2V","Naive Bayes","alpha","None","0.58"])
print(x)
+------------+---------------------+-----------------+-----------------+------+ | Vectorizer | Model | Hyperparameter1 | Hyperparameter2 | AUC | +------------+---------------------+-----------------+-----------------+------+ | BOW | XGBoost | estimators | lrate | 0.66 | | TFIDF | XGBoost | estimators | lrate | 0.67 | | TFIDFW2V | XGBoost | estimators | lrate | 0.59 | | BOW | Logistic Regression | alpha | None | 0.68 | | TFIDF | Logistic Regression | alpha | None | 0.73 | | TFIDFW2V | Logistic Regression | alpha | None | 0.60 | | BOW | Naive Bayes | alpha | None | 0.70 | | TFIDF | Naive Bayes | alpha | None | 0.67 | | TFIDFW2V | Naive Bayes | alpha | None | 0.58 | +------------+---------------------+-----------------+-----------------+------+
THE FINAL CONCLUSION IS "USING "TFIDF" AS THE VECTORIZER AND "LOGISTIC REGRESSION" WE GET THE HIGHEST ACCURACY OF PREDICTING THE DONOR CHOOSE PROBLEM"
#ADDITIONAL INFORMATION ASKED BY THE INSTRUCTOR
#COMPUTING F1_SCORE, CONFUSION MATRIX FOR THE BEST MODEL
clf = LogisticRegression(random_state=0)
clf.fit(X_tr_tfidf,y_train)
train_pred1 = clf.predict_proba(X_tr_tfidf)
test_pred1 = clf.predict_proba(X_te_tfidf)
train_fpr1, train_tpr1, tr_thresholds1 = roc_curve(y_train,np.transpose(train_pred1)[1])
test_fpr, test_tpr, te_thresholds = roc_curve(y_test,np.transpose(test_pred1)[1])
plt.plot(train_fpr1, train_tpr1, label="train AUC ="+str(auc(train_fpr1, train_tpr1)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ERROR PLOTS")
plt.grid()
plt.show()
def find_best_threshold(threshould, fpr, tpr):
t = threshould[np.argmax(tpr*(1-fpr))]
print("the maximum value of tpr*(1-fpr)", max(tpr*(1-fpr)), "for threshold", np.round(t,3))
return t
def predict_with_best_t(proba, threshould):
predictions = []
for i in proba:
if i[1] >= threshould:
predictions.append(1)
else:
predictions.append(0)
return predictions
print("="*100)
from sklearn.metrics import confusion_matrix
best_t = find_best_threshold(tr_thresholds1, train_fpr1, train_tpr1)
print("Test confusion matrix")
confu_mat = confusion_matrix(y_test, predict_with_best_t(test_pred1, best_t))
sns.heatmap(confu_mat,annot = True)
==================================================================================================== the maximum value of tpr*(1-fpr) 0.5548813423204239 for threshold 0.671 Test confusion matrix
<AxesSubplot:>
TP = 9e+03
FP = 2e+03
FN = 3.6e+03
F1_SCORE = TP/(TP + (0.5*(FP+FN)))
print("F1_SCORE is {0}".format(F1_SCORE))
F1_SCORE is 0.7627118644067796